This markdown file is performing clincluster on secretory cells, generating the clusters and finally plot the heatmap.
last date checked: 14 Nov 2019
First we read in the processed dataset.
sceset <- readRDS("../../scFT-paper_rds/20190214_allFT_Clincluster_12clusters_sceset_withUMAP.rds")
# load("../RData/20180725_allFT_Clincluster_12clusters.RData")
# saveRDS(sceset, "../rds/20190214_allFT_Clincluster_12clusters_sceset_withUMAP.rds", compress = T)
# write.csv(as.data.frame(sceset@colData), "../GEO/sceset_colData.csv")
# write.table(counts(sceset), "../GEO/sceset_counts.txt", quote = F, sep = "\t")
dim(sceset)
## [1] 22110 3877
plotTSNE(sceset, colour_by = "final.clusters")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
# UMAP
# too very long time to run
# set.seed(1234)
# sceset <- runUMAP(sceset, ncomponents = 2, feature_set = rowData(sceset)$high.var == T,
# exprs_values = "logcounts", scale_features = TRUE)
# plotUMAP(sceset, colour_by = "Patient2") + xlab("UMAP_1") + ylab("UMAP_2")
plotUMAP(sceset, colour_by = "final.clusters") + xlab("UMAP_1") + ylab("UMAP_2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
Plot the UMAP of fresh cells only
p1 <- plotUMAP(sceset[,sceset$source == "Fresh"], colour_by = "Patient") + xlab("UMAP_1") + ylab("UMAP_2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p2 <- plotUMAP(sceset[,sceset$source == "Fresh"], colour_by = "EPCAM") + xlab("UMAP_1") + ylab("UMAP_2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p3 <- plotUMAP(sceset[,sceset$source == "Fresh"], colour_by = "KRT7") + xlab("UMAP_1") + ylab("UMAP_2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
cowplot::plot_grid(p1,p2, p3, ncol = 3)
# ggsave("plots/Fig1_UMAP_EPCAMandKRT7_fresh_20190510.png", width = 11, height = 2.6)
p1 <- plotPCA(sceset[,sceset$source == "Fresh" & sceset$Patient != "15066L"], colour_by = "KRT7") + xlab("PC1") + ylab("PC2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p2 <- plotPCA(sceset[,sceset$source == "Fresh" & sceset$Patient != "15066L"], colour_by = "PAX8") + xlab("PC1") + ylab("PC2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p3 <- plotPCA(sceset[,sceset$source == "Fresh" & sceset$Patient != "15066L"], colour_by = "CCDC17") + xlab("PC1") + ylab("PC2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p4 <- plotPCA(sceset[,sceset$source == "Fresh" & sceset$Patient != "15066L"], colour_by = "CAPS") + xlab("PC1") + ylab("PC2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
cowplot::plot_grid(p1,p2,p3,p4)
# ggsave("plots/SuppFig2_intermediate_PCAs.png")
## select the fresh secretory cells
secretory <- sceset[,sceset$final.clusters %in% c(8,9,10) & sceset$source == "Fresh" & sceset$Patient != "15066L"] # dim(secretory)
# [1] 22110 1747
secretory <- secretory[ ,logcounts(secretory)["KRT7",] > 2 &
logcounts(secretory)["EPCAM",] > 2 &
logcounts(secretory)["PTPRC",] == 0 &
logcounts(secretory)["CCDC17",] < 1 ]
dim(secretory)
## [1] 22110 1410
# [1] 22110 1410
# not run
ciliated <- sceset[,sceset$final.clusters %in% c(11) & sceset$source == "Fresh" & sceset$Patient != "15066L"]
ciliated <- ciliated[ ,logcounts(ciliated)["KRT7",] <= 2 &
logcounts(ciliated)["EPCAM",] > 2 &
logcounts(ciliated)["PTPRC",] == 0 &
logcounts(ciliated)["CCDC17",] >= 1 ]
dim(ciliated)
matrix <- expm1(cbind(logcounts(secretory), logcounts(ciliated)))
keep <- rowSums(matrix > 1) > 5
sum(keep)
dge <- edgeR::DGEList(counts = matrix[keep,]) # make a edgeR object
rm(matrix,keep)
group <- c(rep("SC",1410), rep("CC", 91))
patient <- c(secretory$Patient2, ciliated$Patient2)
design <- model.matrix(~ 0+group + patient)
v <- voom(dge, design, plot = TRUE)
fit <- lmFit(v, design)
cont.matrix <- makeContrasts(contrasts = "groupSC-groupCC",levels=colnames(design))
fit <- contrasts.fit(fit, cont.matrix)
fit <- eBayes(fit)
rls <- topTable(fit, n = Inf, coef = 1, sort = "logFC", lfc = 1, p = 0.05 )
rls$gene <- rownames(rls)
# write.csv(rls, "../tables/TableS4_markers_secretory_ciliated20190214.csv", row.names = F)
source("clincluster/clincluster_functions.R")
secretory <- PrepareData(secretory, col.for.cluster = "Patient2", do.scale = T)
## [1] "Normalising the data..."
## [1] "Centre the data"
secretory <- HighVarGenes(secretory)
table(rowData(secretory)$high.var)
##
## FALSE TRUE
## 19721 2389
# FALSE TRUE
# 19721 2389
ggplot(data = data.frame(gene.mean = rowData(secretory)$gene.mean,
gene.dispersion = rowData(secretory)$gene.dispersion,
high.var = rowData(secretory)$high.var),
aes(x = gene.mean, y = gene.dispersion, col = high.var) ) +
geom_point(alpha=0.4)
set.seed(1234)
secretory <- runTSNE(object = secretory, ncomponents = 2,
feature_set = rownames(secretory)[rowData(secretory)$high.var],
exprs_values = "logcounts",
perplexity = min(50, floor(ncol(secretory)/5)))
the tSNE from log-transformed data is better then the centred data.
Calculate 20 PCs from high variance genes and the log-transformed counts.
set.seed(12345)
secretory <- runPCA(object = secretory, ncomponents = 20,
exprs_values = "logcounts", rand_seed = 12345,
feature_set = rownames(secretory)[rowData(secretory)$high.var == TRUE])
plotPCA(secretory)
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
Plot the variance by PCs.
plot(1:50, (attr(secretory@reducedDims$PCA, "percentVar")[1:50])*100, pch = 20, xlab = "PC", ylab = "Standard Deviation of PC")
specClust alllows to estimate several popular spectral clustering algorithms, for an overview see von Luxburg (2007).
The Laplacian is constructed from a from nearest neighbors and there are several kernels available. The eigenvalues and eigenvectors are computed using the binding in igraph to arpack. This should ensure that this algorithm is also feasable for larger datasets as the the the distances used have dimension n*m, where n is the number of observations and m the number of nearest neighbors. The Laplacian is sparse and has roughly n*m elements and only k eigenvectors are computed, where k is the number of centers.
set.seed(123456)
secretory <- InitialCluster(secretory, k = c(4,6,6,6,6), ncomponents = 1:12, n.neighbor = 7, spec.method = "kknn")
## [1] "Got data ready"
## [1] "Initial clustering"
##
|
| | 0%
|
|============= | 20%
|
|========================== | 40%
|
|======================================= | 60%
|
|==================================================== | 80%
|
|=================================================================| 100%
plotTSNE(secretory, colour_by = "Patient2")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
table(secretory$initial.cluster)
##
## 11543.1 11543.2 11543.3 11543.4 11545.1 11545.2 11545.3 11545.4 11545.5
## 16 88 56 9 81 86 71 82 18
## 11545.6 11553.1 11553.2 11553.3 11553.4 11553.5 11553.6 15066.1 15066.2
## 63 9 90 114 109 23 14 26 9
## 15066.3 15066.4 15066.5 15066.6 15072.1 15072.2 15072.3 15072.4 15072.5
## 28 54 48 85 46 13 32 29 47
## 15072.6
## 64
p1 <- plotTSNE(secretory[,secretory$Patient2 == 11543], colour_by = "initial.cluster")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p2 <- plotTSNE(secretory[,secretory$Patient2 == 11545], colour_by = "initial.cluster")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p3 <- plotTSNE(secretory[,secretory$Patient2 == 11553], colour_by = "initial.cluster")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p4 <- plotTSNE(secretory[,secretory$Patient2 == 15066], colour_by = "initial.cluster")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
p5 <- plotTSNE(secretory[,secretory$Patient2 == 15072], colour_by = "initial.cluster")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
cowplot::plot_grid(p1,p2,p3,p4,p5,nrow = 2)
matrix <- expm1(logcounts(secretory))
keep <- rowSums(matrix > 1) > 5
sum(keep)
## [1] 15508
# 15508
dge <- edgeR::DGEList(counts = matrix[keep,]) # make a edgeR object
rm(matrix,keep)
secretory@colData$Patient2 <- as.factor(secretory@colData$Patient2)
design <- model.matrix(~ 0 + initial.cluster, data = secretory@colData) # Use 0 because we do not need intercept for this linear model
design2 <- model.matrix(~ 0 + Patient2, data = secretory@colData)
colnames(design)
## [1] "initial.cluster11543.1" "initial.cluster11543.2"
## [3] "initial.cluster11543.3" "initial.cluster11543.4"
## [5] "initial.cluster11545.1" "initial.cluster11545.2"
## [7] "initial.cluster11545.3" "initial.cluster11545.4"
## [9] "initial.cluster11545.5" "initial.cluster11545.6"
## [11] "initial.cluster11553.1" "initial.cluster11553.2"
## [13] "initial.cluster11553.3" "initial.cluster11553.4"
## [15] "initial.cluster11553.5" "initial.cluster11553.6"
## [17] "initial.cluster15066.1" "initial.cluster15066.2"
## [19] "initial.cluster15066.3" "initial.cluster15066.4"
## [21] "initial.cluster15066.5" "initial.cluster15066.6"
## [23] "initial.cluster15072.1" "initial.cluster15072.2"
## [25] "initial.cluster15072.3" "initial.cluster15072.4"
## [27] "initial.cluster15072.5" "initial.cluster15072.6"
v <- voom(dge, design, plot = F)
fit <- lmFit(v, design) # Linear Model for Series of Arrays
initial.clusters <- data.frame(id = colnames(design),
short_id = gsub(pattern = "initial.cluster",
replacement = "", x = colnames(design)),
patient = substr(colnames(design), start = 16, stop = 20))
head(initial.clusters)
## id short_id patient
## 1 initial.cluster11543.1 11543.1 11543
## 2 initial.cluster11543.2 11543.2 11543
## 3 initial.cluster11543.3 11543.3 11543
## 4 initial.cluster11543.4 11543.4 11543
## 5 initial.cluster11545.1 11545.1 11545
## 6 initial.cluster11545.2 11545.2 11545
## Automating makeContrasts call in limma
nc <- nrow(initial.clusters)
contrast_all <- gtools::permutations(v = as.character(initial.clusters$id), n = nc, r = 2)
contrast_all <- as.data.frame(contrast_all)
head(contrast_all)
## V1 V2
## 1 initial.cluster11543.1 initial.cluster11543.2
## 2 initial.cluster11543.1 initial.cluster11543.3
## 3 initial.cluster11543.1 initial.cluster11543.4
## 4 initial.cluster11543.1 initial.cluster11545.1
## 5 initial.cluster11543.1 initial.cluster11545.2
## 6 initial.cluster11543.1 initial.cluster11545.3
table(secretory$initial.cluster)
##
## 11543.1 11543.2 11543.3 11543.4 11545.1 11545.2 11545.3 11545.4 11545.5
## 16 88 56 9 81 86 71 82 18
## 11545.6 11553.1 11553.2 11553.3 11553.4 11553.5 11553.6 15066.1 15066.2
## 63 9 90 114 109 23 14 26 9
## 15066.3 15066.4 15066.5 15066.6 15072.1 15072.2 15072.3 15072.4 15072.5
## 28 54 48 85 46 13 32 29 47
## 15072.6
## 64
initial.clusters$n_cells <- table(secretory$initial.cluster)
n_cells_patients <- table(secretory$Patient2)
initial.clusters$n_cells_patients <- n_cells_patients[match(initial.clusters$patient, names(n_cells_patients))]
initial.clusters$weight_cluster <- initial.clusters$n_cells/initial.clusters$n_cells_patients
initial.clusters$paste_weight_id <- paste(initial.clusters$id,"*",initial.clusters$weight_cluster, sep = "")
contrast_all$P1 <- substr(contrast_all$V1, start = 16, stop = 20) # patient 1
contrast_all$P2 <- substr(contrast_all$V2, start = 16, stop = 20) # patient 1
contrast_all$C1 <- NA
contrast_all$C2 <- NA
contrast_all$n_C1 <- NA
contrast_all$n_C2 <- NA
for(i in 1:nrow(contrast_all)) {
contrast_all$C1[i] <- paste(initial.clusters$paste_weight_id[initial.clusters$patient == contrast_all$P1[i]], collapse = "+")
contrast_all$C2[i] <- paste(initial.clusters$paste_weight_id[initial.clusters$patient == contrast_all$P2[i]], collapse = "+")
}
head(contrast_all)
## V1 V2 P1 P2
## 1 initial.cluster11543.1 initial.cluster11543.2 11543 11543
## 2 initial.cluster11543.1 initial.cluster11543.3 11543 11543
## 3 initial.cluster11543.1 initial.cluster11543.4 11543 11543
## 4 initial.cluster11543.1 initial.cluster11545.1 11543 11545
## 5 initial.cluster11543.1 initial.cluster11545.2 11543 11545
## 6 initial.cluster11543.1 initial.cluster11545.3 11543 11545
## C1
## 1 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 2 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 3 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 4 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 5 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 6 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## C2
## 1 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 2 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 3 initial.cluster11543.1*0.0946745562130177+initial.cluster11543.2*0.520710059171598+initial.cluster11543.3*0.331360946745562+initial.cluster11543.4*0.0532544378698225
## 4 initial.cluster11545.1*0.201995012468828+initial.cluster11545.2*0.214463840399002+initial.cluster11545.3*0.177057356608479+initial.cluster11545.4*0.204488778054863+initial.cluster11545.5*0.0448877805486284+initial.cluster11545.6*0.1571072319202
## 5 initial.cluster11545.1*0.201995012468828+initial.cluster11545.2*0.214463840399002+initial.cluster11545.3*0.177057356608479+initial.cluster11545.4*0.204488778054863+initial.cluster11545.5*0.0448877805486284+initial.cluster11545.6*0.1571072319202
## 6 initial.cluster11545.1*0.201995012468828+initial.cluster11545.2*0.214463840399002+initial.cluster11545.3*0.177057356608479+initial.cluster11545.4*0.204488778054863+initial.cluster11545.5*0.0448877805486284+initial.cluster11545.6*0.1571072319202
## n_C1 n_C2
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
contrast_matrix <- apply(contrast_all, MARGIN = 1, function(x) return(paste(x[1],"-",x[2],"-(",x[5],")","+(", x[6],")", sep = "")))
cont.matrix <- makeContrasts(contrasts = contrast_matrix,levels=design)
cont.matrix[,5]
## initial.cluster11543.1 initial.cluster11543.2 initial.cluster11543.3
## 0.90532544 -0.52071006 -0.33136095
## initial.cluster11543.4 initial.cluster11545.1 initial.cluster11545.2
## -0.05325444 0.20199501 -0.78553616
## initial.cluster11545.3 initial.cluster11545.4 initial.cluster11545.5
## 0.17705736 0.20448878 0.04488778
## initial.cluster11545.6 initial.cluster11553.1 initial.cluster11553.2
## 0.15710723 0.00000000 0.00000000
## initial.cluster11553.3 initial.cluster11553.4 initial.cluster11553.5
## 0.00000000 0.00000000 0.00000000
## initial.cluster11553.6 initial.cluster15066.1 initial.cluster15066.2
## 0.00000000 0.00000000 0.00000000
## initial.cluster15066.3 initial.cluster15066.4 initial.cluster15066.5
## 0.00000000 0.00000000 0.00000000
## initial.cluster15066.6 initial.cluster15072.1 initial.cluster15072.2
## 0.00000000 0.00000000 0.00000000
## initial.cluster15072.3 initial.cluster15072.4 initial.cluster15072.5
## 0.00000000 0.00000000 0.00000000
## initial.cluster15072.6
## 0.00000000
fit2 <- contrasts.fit(fit, cont.matrix) # Compute Contrasts from Linear Model Fit
fit2 <- eBayes(fit2)
The DE gene weight is decided by the fold change and the ratio of expression proportion.
## parameter:
## logFC = 0.6
## p-value = 0.05
## weight = abs(logFC)*(expr_ratio1+0.01)/(expr_ratio2+0.01)
## expr_ratio_max > 0.25
n_deg2 <- matrix(0, ncol = nc, nrow = nc) # number of DE genes
colnames(n_deg2) <- rownames(n_deg2) <- gsub(x = colnames(design)[1:nc], pattern = "initial.cluster",replacement = "")
for(i in 1:nc) {
for(j in 1:nc) {
if(i == j) {
n_deg2[i,j] <- 0
} else if (j < i) {
coef_k = (i-1)*(nc-1)+j
} else if (j > i) {
coef_k = (i-1)*(nc-1)+j-1
}
if(i != j) {
rls <- topTable(fit2, n = Inf, coef = coef_k, sort = "p", lfc = 0.6, p = 0.05 )
if(nrow(rls) > 1) {
v_expr <- logcounts(secretory)[rownames(rls), secretory$initial.cluster == rownames(n_deg2)[i]]
rls$ratio1 <- rowSums(v_expr > 0.5)/ncol(v_expr)
v_expr <- logcounts(secretory)[rownames(rls), secretory$initial.cluster == colnames(n_deg2)[j]]
rls$ratio2 <- rowSums(v_expr > 0.5)/ncol(v_expr)
rls$ratiomax <- rowMaxs(as.matrix(rls[,c("ratio1", "ratio2")]))
rls$ratiomin <- rowMins(as.matrix(rls[,c("ratio1", "ratio2")]))
rls <- rls[rls$ratiomax > 0.25, ]
n_deg2[i,j] <- sum(apply(rls, MARGIN = 1, function(x) return(abs(x[1]) * (x[9]+0.01)/(x[10]+0.01)))) ## 0.01 is used here to exaggerate the differences of on-off genes
} else if (nrow(rls) == 1) {
n_deg2[i,j] <- sum(rls$logFC)
}
## This eqaution take fold change and expression ratio into account
## Question: should we talk a upper limit to the weight?
}
}
}
# saveRDS(n_deg2, "../../scFT-paper_rds/clincluster_secretory_n_deg2_distMat191113.rds")
## pre-run results
n_deg2 <- readRDS("../../scFT-paper_rds/clincluster_secretory_n_deg2_distMat191113.rds")
# any(is.na(n_deg2))
n_deg2[1:5,1:5]
## 11543.1 11543.2 11543.3 11543.4 11545.1
## 11543.1 0.0000 1683.9316 3216.994 1582.100 663.2267
## 11543.2 1683.9316 0.0000 6382.608 1834.765 461.0399
## 11543.3 3216.9943 6382.6082 0.000 11271.148 1909.0925
## 11543.4 1582.0999 1834.7651 11271.148 0.000 3145.7421
## 11545.1 663.2267 461.0399 1909.092 3145.742 0.0000
## 7 clusters
hc <- hclust(as.dist(n_deg2))
hc.cluster <- cutree(hc, k = 10)
colData(secretory)$clincluster.7clusters <- hc.cluster[match(colData(secretory)$initial.cluster, names(hc.cluster))]
# secretory$clincluster.7clusters[secretory$initial.cluster == "11553.2"] <- 8
colData(secretory)$clincluster.7clusters <- as.factor(colData(secretory)$clincluster.7clusters)
table(colData(secretory)$clincluster.7clusters)
##
## 1 2 3 4 5 6 7 8 9 10
## 92 228 167 32 312 272 154 40 90 23
## visualisation
hc <- hclust(as.dist(n_deg2))
plot(hc);rect.hclust(hc, k = 10, border = "red")
plotTSNE(secretory, colour_by = "clincluster.7clusters")
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
# tiff("../manuscript_plots/FigureS6A_Clustering_FTESCs_tSNE.tiff", res = 300, height = 10, width = 14, units = "cm")
# plotTSNE(secretory, colour_by = "clincluster")
# dev.off()
#
# tiff("../manuscript_plots/FigureS6B_Clustering_FTESCs_tSNE_patients.tiff", res = 300, height = 10, width = 14, units = "cm")
# plotTSNE(secretory, colour_by = "Patient2")
# dev.off()
secretory$clincluster_final <- secretory$clincluster.7clusters
secretory$clincluster_final[secretory$clincluster.7clusters == 7] <- 4
secretory$clincluster_final <- paste("C",secretory$clincluster_final,sep = "")
markers2 <- c()
logcounts <- logcounts(secretory)
for(i in 1:length(unique(secretory$clincluster_final))){
info <- rep("control", ncol(secretory))
info[secretory$clincluster_final == unique(secretory$clincluster_final)[i]] <- "group"
design <- model.matrix(~ 0 + info)
v <- voom(dge, design, plot = F)
fit <- lmFit(v, design) # Linear Model for Series of Arrays
cont.matrix <- makeContrasts(contrasts = "infogroup-infocontrol",levels=design)
fit <- contrasts.fit(fit, cont.matrix ) # Linear Model for Series of Arrays
fit <- eBayes(fit)
marker <- topTable(fit, p.value = 0.05, number = Inf, coef = 1, lfc = 0.6, sort.by = "logFC")
marker <- marker[marker$logFC > 0.6,]
v_expr <- logcounts[match(rownames(marker), rownames(logcounts)), info == "group"]
marker$ratio1 <- rowSums(v_expr > 0.5)/ncol(v_expr)
v_expr <- logcounts[match(rownames(marker), rownames(logcounts)), info != "group"]
marker$ratio2 <- rowSums(v_expr > 0.5)/ncol(v_expr)
marker$gene <- rownames(marker)
marker$cluster <- unique(secretory$clincluster_final)[i]
markers2 <- rbind(markers2, marker)
}
markers2$cluster <- factor(markers2$cluster)
# write.csv(markers2, "../tables/20190120Clincluster_fresh_secretory_9clusters_markers.csv")
markers2 <- read.csv("../../scFT-paper_rds/20190120Clincluster_fresh_secretory_9clusters_markers.csv", as.is = T)
top10 <- markers2 %>% group_by(cluster) %>% top_n(3, logFC)
## Warning: The `printer` argument is deprecated as of rlang 0.3.0.
## This warning is displayed once per session.
knitr::kable(top10)
| X | logFC | AveExpr | t | P.Value | adj.P.Val | B | ratio1 | ratio2 | gene | cluster |
|---|---|---|---|---|---|---|---|---|---|---|
| SERINC5 | 1.956063 | 7.107523 | 10.547525 | 0.0e+00 | 0.0000000 | 46.109691 | 0.9166667 | 0.7563452 | SERINC5 | C2 |
| SET | 1.501504 | 8.682341 | 14.337685 | 0.0e+00 | 0.0000000 | 88.053797 | 1.0000000 | 0.9822335 | SET | C2 |
| CDV3 | 1.426887 | 5.157615 | 7.548573 | 0.0e+00 | 0.0000000 | 20.828222 | 0.6710526 | 0.5490694 | CDV3 | C2 |
| OVGP1 | 1.415347 | 5.962832 | 5.222745 | 2.0e-07 | 0.0000462 | 6.760365 | 0.5608974 | 0.4262295 | OVGP1 | C5 |
| TRIB1 | 1.125414 | 6.411907 | 6.188506 | 0.0e+00 | 0.0000004 | 11.907277 | 0.8108974 | 0.6621129 | TRIB1 | C5 |
| FHL2 | 1.093554 | 5.512801 | 5.650759 | 0.0e+00 | 0.0000061 | 8.934340 | 0.6698718 | 0.5018215 | FHL2 | C5 |
| DPP4 | 2.243219 | 4.054546 | 9.317093 | 0.0e+00 | 0.0000000 | 34.637577 | 0.6766467 | 0.2622687 | DPP4 | C3 |
| LTBP4 | 2.102409 | 5.130780 | 8.724552 | 0.0e+00 | 0.0000000 | 29.709480 | 0.7964072 | 0.4489139 | LTBP4 | C3 |
| SLC25A25 | 2.074663 | 5.932589 | 9.006236 | 0.0e+00 | 0.0000000 | 32.029021 | 0.9161677 | 0.5929204 | SLC25A25 | C3 |
| JUN | 2.724953 | 6.173253 | 14.027896 | 0.0e+00 | 0.0000000 | 84.025000 | 0.8566176 | 0.5755712 | JUN | C6 |
| FOS | 2.510552 | 11.718297 | 16.909273 | 0.0e+00 | 0.0000000 | 122.037795 | 0.9963235 | 0.9630931 | FOS | C6 |
| TXNIP | 2.184585 | 3.680430 | 10.965286 | 0.0e+00 | 0.0000000 | 50.154916 | 0.5000000 | 0.1652021 | TXNIP | C6 |
| ACTA2 | 6.186113 | 3.808314 | 15.277895 | 0.0e+00 | 0.0000000 | 95.334245 | 0.8750000 | 0.2547445 | ACTA2 | C8 |
| TIMP3 | 5.489491 | 2.803985 | 12.653211 | 0.0e+00 | 0.0000000 | 64.819243 | 0.9000000 | 0.0802920 | TIMP3 | C8 |
| A2M | 4.561939 | 2.722170 | 9.437405 | 0.0e+00 | 0.0000000 | 34.280756 | 0.6750000 | 0.0525547 | A2M | C8 |
| KRT17 | 3.631127 | 5.782838 | 12.343843 | 0.0e+00 | 0.0000000 | 64.801783 | 0.7580645 | 0.3839869 | KRT17 | C4 |
| SNCG | 3.486426 | 6.076391 | 16.095046 | 0.0e+00 | 0.0000000 | 111.195687 | 0.9193548 | 0.5653595 | SNCG | C4 |
| PIGR | 3.355805 | 3.148991 | 15.322608 | 0.0e+00 | 0.0000000 | 100.904963 | 0.6774194 | 0.0808824 | PIGR | C4 |
| HLA-DRB51 | 1.901589 | 8.147843 | 7.993135 | 0.0e+00 | 0.0000000 | 23.753971 | 0.9782609 | 0.9044006 | HLA-DRB5 | C1 |
| ZC3H12A2 | 1.757681 | 8.082003 | 6.175338 | 0.0e+00 | 0.0000001 | 11.846008 | 0.9565217 | 0.8239757 | ZC3H12A | C1 |
| CTGF1 | 1.751632 | 7.709149 | 4.707172 | 2.8e-06 | 0.0000512 | 4.356458 | 0.8695652 | 0.7116844 | CTGF | C1 |
| STMN1 | 4.492856 | 4.766765 | 8.135457 | 0.0e+00 | 0.0000000 | 24.553233 | 0.9565217 | 0.4419611 | STMN1 | C10 |
| MCM71 | 4.318530 | 4.028890 | 7.192254 | 0.0e+00 | 0.0000000 | 17.940752 | 0.9130435 | 0.2941601 | MCM7 | C10 |
| OVGP11 | 4.038045 | 5.962832 | 4.990426 | 7.0e-07 | 0.0002998 | 5.636433 | 0.8260870 | 0.4498919 | OVGP1 | C10 |
| CXCL1 | 5.657529 | 5.751003 | 16.819535 | 0.0e+00 | 0.0000000 | 120.111966 | 0.9444444 | 0.4500000 | CXCL1 | C9 |
| SERPINA31 | 5.175707 | 4.339827 | 15.246068 | 0.0e+00 | 0.0000000 | 98.969586 | 0.8888889 | 0.2530303 | SERPINA3 | C9 |
| CCL22 | 5.119400 | 6.475462 | 15.195825 | 0.0e+00 | 0.0000000 | 98.531420 | 0.9666667 | 0.5734848 | CCL2 | C9 |
my_col <- RColorBrewer::brewer.pal(12, "Paired")[c(2,8,4,6,10,12,11,9,7,5,3,1)]
# secretory$clincluster_final <- as.factor(secretory$clincluster_final)
p1 <- plotTSNE(secretory[,secretory$Patient2 == 11543], colour_by = "clincluster_final") + scale_fill_manual( values = my_col[c(1,2,3,4)])
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
p2 <- plotTSNE(secretory[,secretory$Patient2 == 11545], colour_by = "clincluster_final") + scale_fill_manual( values = my_col[c(2,3,4,5,6,8)])
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
p3 <- plotTSNE(secretory[,secretory$Patient2 == 11553], colour_by = "clincluster_final") + scale_fill_manual(values = my_col[c(10,4,5,6,8,9)])
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
p4 <- plotTSNE(secretory[,secretory$Patient2 == 15066], colour_by = "clincluster_final") + scale_fill_manual( values = my_col[c(10,2,3,4,5,6)])
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
p5 <- plotTSNE(secretory[,secretory$Patient2 == 15072], colour_by = "clincluster_final") + scale_fill_manual(values = my_col[c(1,4,5,6,8)])
## Warning: 'add_ticks' is deprecated.
## Use '+ geom_rug(...)' instead.
## Scale for 'fill' is already present. Adding another scale for 'fill',
## which will replace the existing scale.
cowplot::plot_grid(p1,p2,p3,p4,p5,nrow = 3)
# ggsave("exprloratory_plots/20180121tSNE_Patients_9clusters.pdf", height = 8, width = 7, units = "in", dpi = 150)
table(secretory$clincluster_final, secretory$Patient2)
##
## 11543 11545 11553 15066 15072
## C1 16 0 0 0 76
## C10 0 0 14 9 0
## C2 88 86 0 54 0
## C3 56 63 0 48 0
## C4 9 82 23 26 46
## C5 0 81 114 85 32
## C6 0 71 109 28 64
## C8 0 18 9 0 13
## C9 0 0 90 0 0
# saveRDS(secretory, "../rds/20190120Fresh_secretory_9clusters_clincluster.rds", compress = T)
# secretory <- readRDS("../rds/20190120Fresh_secretory_9clusters_clincluster.rds")
# markers2 <- read.csv("../tables/20190120Clincluster_fresh_secretory_9clusters_markers.csv", as.is = T)
set.seed(1234)
dim(stroma)
## [1] 23710 91
matrix <- counts(secretory)[,secretory$clincluster_final == "C8"]
matrix <- cbind(matrix,counts(stroma)[match(rownames(matrix), rownames(stroma)),] )
matrix <- cpm(matrix)
dge <- edgeR::DGEList(counts = matrix[rowSums(matrix) > 5, ])
info <- c(rep("C8", 40), rep("str",91))
design <- model.matrix(~ 0 + info)
v <- voom(dge, design, plot = F)
fit <- lmFit(v, design) # Linear Model for Series of Arrays
cont.matrix <- makeContrasts(contrasts = "infoC8-infostr",levels=design)
fit <- contrasts.fit(fit, cont.matrix ) # Linear Model for Series of Arrays
fit <- eBayes(fit)
C8.m <- topTable(fit, p.value = 0.05, number = Inf, coef = 1, lfc = 0.6, sort.by = "logFC")
C8.m$gene <- rownames(C8.m)
which(markers2$gene[markers2$cluster == "C8"] %in% C8.m$gene[C8.m$logFC > 0])
## [1] 3 4 20 25 42 63 68 74 76 81 82 85 87 96 108 121 124
## [18] 133 157
matrix <- counts(secretory)[,secretory$clincluster_final == "C8"]
matrix <- cbind(matrix, counts(secretory)[,secretory$clincluster_final %in% c("C3","C4","C10")])
matrix <- cbind(matrix,counts(stroma)[match(rownames(matrix), rownames(stroma)),] )
info <- c(rep("C8",sum(secretory$clincluster_final == "C8")),
rep("secretory", sum(secretory$clincluster_final %in% c("C3","C4","C10"))),
rep("str",91))
matrix <- cpm(matrix)
dge <- edgeR::DGEList(counts = matrix[rowSums(matrix) > 5, ])
design <- model.matrix(~ 0 + info)
v <- voom(dge, design, plot = F)
fit <- lmFit(v, design) # Linear Model for Series of Arrays
cont.matrix <- makeContrasts(contrasts = c("infoC8-infosecretory","infostr-infosecretory","infostr-infoC8"),levels=design)
fit <- contrasts.fit(fit, cont.matrix) # Linear Model for Series of Arrays
fit <- eBayes(fit)
StrvsSec.m <- topTable(fit, p.value = 0.05, number = Inf, coef = 2, lfc = 0.6, sort.by = "logFC")
StrvsSec.m$gene <- rownames(StrvsSec.m)
C8vsSec.m <- topTable(fit, p.value = 0.05, number = Inf, coef = 1, lfc = 0.6, sort.by = "logFC")
C8vsSec.m$gene <- rownames(C8vsSec.m)
StrvsC8.m <- topTable(fit, p.value = 0.05, number = Inf, coef = 1, lfc = 0.6, sort.by = "logFC")
StrvsC8.m$gene <- rownames(StrvsC8.m)
stroma_control <- SingleCellExperiment(assays = list(counts = matrix ), colData = data.frame(Type = info))
logcounts(stroma_control) <- log1p(calculateCPM(stroma_control))
plotExpression(stroma_control, features = c(markers2$gene[markers2$cluster == "C8"][which(markers2$gene[markers2$cluster == "C8"] %in% C8.m$gene[C8.m$logFC > 0])]), x = "Type", ncol = 4, scales = "free")
plotExpression(stroma_control, features = c("COL1A2","COL3A1"), x = "Type", ncol = 2, scales = "free") +
scale_x_discrete(labels = c("EMT","Other FTESCs","Stroma"),breaks = c("C8","secretory","str")) +
theme(strip.background = element_rect(fill = "white"),strip.text.x = element_text(face = "italic", size = 12))
# ggsave("plots/SuppFig3_EMT_stroma_col1a.png", height = 3, width = 5)
plotExpression(stroma_control, features = c("SPARC","RGS16","COL1A2","COL3A1","EPCAM","KRT7"),
x = "Type", ncol = 2, scales = "free") +
scale_x_discrete(labels = c("EMT","Other FTESCs","Stroma"),breaks = c("C8","secretory","str")) +
theme(strip.background = element_rect(fill = "white"),strip.text.x = element_text(face = "italic", size = 12))
# ggsave("plots/SuppFig3_EMT_markers_with_stroma_control.png", width = 6, height = 8)
fresh <- sceset[,sceset$source == "Fresh"]
fresh$cell_type <- NA
fresh$cell_type[colnames(fresh) %in% colnames(sceset)[sceset$final.clusters == 11]] <- "Ciliated"
fresh$cell_type[colnames(fresh) %in% colnames(secretory)] <- "Secretory"
table(fresh$cell_type)
##
## Ciliated Secretory
## 146 1410
library(DoubletFinder) # 2.0.1
# devtools::install_version(package = 'Seurat', version = package_version('2.3.4'))
library(Seurat) # 2.3.4
## Loading required package: cowplot
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
fresh_seu <- Seurat::as.seurat(fresh)
sc16_seu <- Seurat::as.seurat(sc16)
sc16_seu@meta.data$Source <- "Fresh"
fresh_seu@meta.data <- fresh_seu@meta.data[,colnames(fresh_seu@meta.data) %in% colnames(sc16_seu@meta.data)]
sc16_seu@meta.data <- sc16_seu@meta.data[,match(colnames(fresh_seu@meta.data), colnames(sc16_seu@meta.data))]
fresh_seu@meta.data <- fresh_seu@meta.data [,c(1,2,3,4,5,21)]
sc16_seu@meta.data <- sc16_seu@meta.data [,c(1,2,3,4,5,21)]
seurat <- Seurat::MergeSeurat(object1 = fresh_seu, object2 = sc16_seu)
seurat <- FindVariableGenes(seurat)
seurat <- ScaleData(seurat)
seurat <- RunPCA(seurat, pc.genes = seurat@var.genes)
fresh2 <- as.SingleCellExperiment(seurat)
set.seed(12334);fresh2 <- runTSNE(fresh2)
plotTSNE(fresh2, colour_by = "PTPRC")
seurat <- NormalizeData(seurat)
seurat <- FindVariableGenes(seurat, x.low.cutoff = 0.0125, y.cutoff = 0.25, do.plot=FALSE)
seurat <- ScaleData(object = seurat, genes.use = seurat@var.genes)
seurat <- RunPCA(seurat, pc.genes = seurat@var.genes, pcs.print = 0)
seurat <- RunTSNE(seurat, dims.use = 1:10, verbose=TRUE)
DimElbowPlot(seurat)
seurat <- FindClusters(object = seurat, reduction.type = "pca", dims.use = 1:10,
resolution = 0.3, print.output = 0, save.SNN = TRUE, force.recalc = T)
TSNEPlot(object = seurat)
## pK Identification
sweep.res.list <- paramSweep(seurat, PCs = 1:10)
sweep.stats <- summarizeSweep(sweep.res.list, GT = FALSE)
bcmvn <- find.pK(sweep.stats) #0.07
## Homotypic Doublet Proportion Estimate
annotations <- seurat@ident
homotypic.prop <- modelHomotypic(annotations) ## ex: annotations <- seu_kidney@meta.data$ClusteringResults
nExp_poi <- round(0.01*length(seurat@cell.names)) ## 1% based on 2000 cells
nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
## Run DoubletFinder with varying classification stringencies
seurat <- doubletFinder(seurat, PCs = 1:10, pN = 0.25, pK = 0.07, nExp = nExp_poi, reuse.pANN = FALSE)
seurat <- doubletFinder(seurat, PCs = 1:10, pN = 0.25, pK = 0.07, nExp = nExp_poi.adj, reuse.pANN = "pANN_0.25_0.07_25")
# saveRDS(seurat,"rds/20190508Seurat_all_fresh_doublefinder.rds", compress = T)
seurat <- readRDS("../../scFT-paper_rds/20190508Seurat_all_fresh_doublefinder.rds")
## Plot results
seurat@meta.data$DF_hi.lo <- seurat@meta.data$DF.classifications_0.25_0.07_25
seurat@meta.data$DF_hi.lo[which(seurat@meta.data$DF_hi.lo == "Doublet" & seurat@meta.data$DF.classifications_0.25_0.07_21 == "Singlet")] <- "Doublet_lo"
seurat@meta.data$DF_hi.lo[which(seurat@meta.data$DF_hi.lo == "Doublet")] <- "Doublet_hi"
TSNEPlot(seurat, group.by="DF_hi.lo", plot.order=c("Doublet_hi","Doublet_lo","Singlet"), colors.use=c("black","gold","red"))
seurat@meta.data$cell_type <- NA
seurat@meta.data$cell_type <- fresh$cell_type[match(rownames(seurat@meta.data), colnames(fresh))]
seurat@meta.data$cell_subtype <- secretory$clincluster_final[match(rownames(seurat@meta.data), colnames(secretory))]
# table(seurat@meta.data$cell_subtype, seurat@meta.data$DF_hi.lo)
secretory$DF_hi.lo <- seurat@meta.data$DF_hi.lo[match(colnames(secretory), rownames(seurat@meta.data))]
knitr::kable(table(secretory$DF_hi.lo[secretory$clincluster_final == "C8"]), caption = "The 40 cells in the EMT cluster are all singlet.")
| Var1 | Freq |
|---|---|
| Singlet | 40 |
sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] Seurat_2.3.4 Matrix_1.2-15
## [3] cowplot_0.9.4 DoubletFinder_2.0.1
## [5] bindrcpp_0.2.2 reshape2_1.4.3
## [7] scales_1.0.0 dplyr_0.7.8
## [9] edgeR_3.24.3 limma_3.38.3
## [11] scater_1.10.1 ggplot2_3.2.0.9000
## [13] SingleCellExperiment_1.4.1 SummarizedExperiment_1.12.0
## [15] DelayedArray_0.8.0 BiocParallel_1.16.5
## [17] matrixStats_0.54.0 Biobase_2.42.0
## [19] GenomicRanges_1.34.0 GenomeInfoDb_1.18.1
## [21] IRanges_2.16.0 S4Vectors_0.20.1
## [23] BiocGenerics_0.28.0
##
## loaded via a namespace (and not attached):
## [1] snow_0.4-3 backports_1.1.3
## [3] Hmisc_4.2-0 plyr_1.8.4
## [5] igraph_1.2.3 lazyeval_0.2.1
## [7] splines_3.5.2 digest_0.6.18
## [9] foreach_1.4.4 htmltools_0.3.6
## [11] viridis_0.5.1 lars_1.2
## [13] gdata_2.18.0 magrittr_1.5
## [15] checkmate_1.9.1 cluster_2.0.7-1
## [17] mixtools_1.1.0 ROCR_1.0-7
## [19] R.utils_2.7.0 colorspace_1.4-0
## [21] xfun_0.4 jsonlite_1.6
## [23] crayon_1.3.4 RCurl_1.95-4.11
## [25] bindr_0.1.1 zoo_1.8-4
## [27] survival_2.43-3 iterators_1.0.10
## [29] ape_5.2 glue_1.3.0
## [31] gtable_0.2.0 zlibbioc_1.28.0
## [33] XVector_0.22.0 kernlab_0.9-27
## [35] Rhdf5lib_1.4.2 prabclus_2.2-7
## [37] DEoptimR_1.0-8 HDF5Array_1.10.1
## [39] mvtnorm_1.0-8 bibtex_0.4.2
## [41] Rcpp_1.0.0 metap_1.1
## [43] dtw_1.20-1 viridisLite_0.3.0
## [45] htmlTable_1.13.1 reticulate_1.10
## [47] foreign_0.8-71 bit_1.1-14
## [49] proxy_0.4-22 mclust_5.4.2
## [51] SDMTools_1.1-221 Formula_1.2-3
## [53] tsne_0.1-3 htmlwidgets_1.3
## [55] httr_1.4.0 gplots_3.0.1.1
## [57] RColorBrewer_1.1-2 fpc_2.1-11.1
## [59] acepack_1.4.1 modeltools_0.2-22
## [61] ica_1.0-2 pkgconfig_2.0.2
## [63] R.methodsS3_1.7.1 flexmix_2.3-14
## [65] nnet_7.3-12 locfit_1.5-9.1
## [67] tidyselect_0.2.5 labeling_0.3
## [69] rlang_0.4.0 munsell_0.5.0
## [71] tools_3.5.2 ggridges_0.5.1
## [73] evaluate_0.13 stringr_1.4.0
## [75] yaml_2.2.0 kknn_1.3.1
## [77] npsurv_0.4-0 knitr_1.21
## [79] bit64_0.9-7 fitdistrplus_1.0-14
## [81] robustbase_0.93-3 caTools_1.17.1.1
## [83] purrr_0.3.0 RANN_2.6.1
## [85] pbapply_1.4-0 nlme_3.1-137
## [87] R.oo_1.22.0 hdf5r_1.0.1
## [89] compiler_3.5.2 rstudioapi_0.9.0
## [91] png_0.1-7 beeswarm_0.2.3
## [93] lsei_1.2-0 tibble_2.0.1
## [95] stringi_1.2.4 highr_0.7
## [97] lattice_0.20-38 trimcluster_0.1-2.1
## [99] pillar_1.3.1 lmtest_0.9-36
## [101] Rdpack_0.10-1 irlba_2.3.3
## [103] data.table_1.12.0 bitops_1.0-6
## [105] gbRd_0.4-11 R6_2.3.0
## [107] latticeExtra_0.6-28 KernSmooth_2.23-15
## [109] gridExtra_2.3 vipor_0.4.5
## [111] codetools_0.2-16 MASS_7.3-51.1
## [113] gtools_3.8.1 assertthat_0.2.0
## [115] rhdf5_2.26.2 withr_2.1.2
## [117] GenomeInfoDbData_1.2.0 diptest_0.75-7
## [119] doSNOW_1.0.16 grid_3.5.2
## [121] rpart_4.1-13 tidyr_0.8.2
## [123] class_7.3-15 rmarkdown_1.11
## [125] DelayedMatrixStats_1.4.0 segmented_0.5-3.0
## [127] Rtsne_0.15 base64enc_0.1-3
## [129] ggbeeswarm_0.6.0